library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("GGally")
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(here)
## here() starts at /g/huber/users/fridljand/R/ihw-forest-paper
options(bitmapType ="cairo")
chr <- 21
chr_full_data_grubert <- loadRData(paste0("/g/scb/zaugg/zaugg/hQTL/GenotypeData/snps", chr, ".filtered.allGT.txt.rda"))
Data driven parameter for \(s_i\)
s_prob <- mean(chr_full_data_grubert[[1]])/2
s_prob
Same number of patients
nsample <- ncol(chr_full_data_grubert)
nsample
Read fitted modell
chr_pvalues_grubert <- data.table::fread(here(paste0("data/hqtl_chrom1_chrom2/results/cisQTLs_H3K27AC_chr",chr,".txt")))
head(chr_pvalues_grubert)
betha <- mean(chr_pvalues_grubert$beta)
sanity check
chr_pvalues_grubert$`p-value`[1]
2*(1-pt(abs(chr_pvalues_grubert$`t-stat`[1]), df = 74))
We are following the notation in (Shabalin 2012). For a fixed SNP and a fixed histone modification peak, let \(s_i\in \{0,1,2\}\) be the frequency of the minor allele and \(g_i \in \mathbb{R}_{\geq 0}\) be the level of histone modification measured in individual \(i \in \{1,\ldots, n\}\). Let \(H \sim \mathop{Bernouilli}(1-\pi_0)\) indicate, if there actually is an association between \(s_i\) and \(g_i\). We model the relationship as follows:
\[ \begin{aligned} &g_i =\alpha+(H\beta) s_{i}+\epsilon_{i}.\end{aligned} \]
with \(s \sim \operatorname{Bin}(2,p)\) and \(\epsilon \sim \mathcal{N}\left(0, \sigma^2\right)\). We will be testing
\[ H_0: (H\beta) =0 \quad \text { against } \quad H_1: (H\beta) \neq 0. \]
Hyperparameters for simulation
ntrials = 1000
nsample = 76
pi0 = 0.9
#alpha should not play any role
simulate_data <- function(H, nsample = 76, s_prob = 0.26, alpha = 50, betha = 30, sigma = 2){
data_sim <- data.frame(
s_prob = s_prob,
alpha = alpha,
betha = betha, #true
sigma = sigma,
nsample_i = seq_len(nsample),
#H = rbinom(nsample,1,1-pi0), * H
#TODO simulate alternatives
s = rbinom(n = nsample, size = 2, s_prob),
noise = rnorm(n = nsample, sd = sigma)
)
data_sim <- data_sim %>%
mutate(histone_modification = alpha + betha * s*H + noise)
return(data_sim)
}
H <- rbinom(ntrials, size = 1, 1-pi0)
data_sim <- lapply(H, simulate_data)
head(data_sim[[1]])
## s_prob alpha betha sigma nsample_i s noise histone_modification
## 1 0.26 50 30 2 1 0 -1.618697671 48.38130
## 2 0.26 50 30 2 2 0 1.359144989 51.35914
## 3 0.26 50 30 2 3 0 0.493898067 50.49390
## 4 0.26 50 30 2 4 2 0.732178152 50.73218
## 5 0.26 50 30 2 5 0 -2.733669559 47.26633
## 6 0.26 50 30 2 6 0 -0.009020177 49.99098
Let \(\bar{s} := \frac{1}{n}\sum_{i=1}^n s_{i}\). Then the minor allele frequency can be defined as
\[ f:=\frac{1}{2 n} \min \left(\sum_{i=1}^n s_i, 2 n-\sum_{i=1}^n s_i\right)=\frac{1}{2} \min (\bar{s}, 2-\bar{s}) \in[0,0.5] \]
That is, of the $2n$ alleles present in $n$ individuals, what the proportion of the minor allele at the pre-specified SNP is.
minor_allele_frequencies_sim <- sapply(data_sim, function(data_sim_i){
0.5* min(mean(data_sim_i$s), 2- mean(data_sim_i$s))
})
We test
\[ H_0: (H\beta) =0 \quad \text { against } \quad H_1: (H\beta) \neq 0. \]
Our test statistic is
\[ t_{\text {score }}=\frac{\widehat{\beta}}{S E_{\widehat{\beta}}}, \]
where \(\widehat{\beta}\) is the least square estimator for \(H\beta\) and \(S E_{\widehat{\beta}}\) is the associated standard error. We are following the notation from B7.13 in (Czado and Schmidt 2011). Under the null and assuming \(\epsilon_{1},\ldots , \epsilon_{n} \sim \text { i.i.d. } \mathcal{N}\left(0, \sigma^2\right)\) we have \(t_{\text {score }} \sim \mathcal{T}_{n-2}\).
sim_res <- lapply(data_sim, function(data_sim_i){
t_test_lm = lm(histone_modification ~ 1 + s, data=data_sim_i)
s_line_t_test_lm = summary(t_test_lm)$coefficients[2,]
s_line_t_test_lm = data.frame(as.list(s_line_t_test_lm))
#calculate minor_allele_frequency
s_line_t_test_lm$minor_allele_frequency = 0.5* min(mean(data_sim_i$s), 2- mean(data_sim_i$s))
s_line_t_test_lm$s_variance <- var(data_sim_i$s)
s_line_t_test_lm
})
sim_res <- bind_rows(sim_res)
#sim_res <- sim_res %>% rename(pvalue = `Pr...t..`)
colnames(sim_res)[4] <- "pvalue"
sim_res$H <- as.factor(H)
head(sim_res)
## Estimate Std..Error t.value pvalue minor_allele_frequency
## 1 0.10803028 0.3369821 0.3205817 0.749430959 0.2763158
## 2 0.05270469 0.3878867 0.1358765 0.892287851 0.2434211
## 3 1.07007858 0.4042360 2.6471629 0.009912786 0.2039474
## 4 0.39721912 0.4796317 0.8281754 0.410234260 0.2105263
## 5 -0.09696710 0.3994907 -0.2427268 0.808888564 0.1842105
## 6 0.89035418 0.4347057 2.0481768 0.044090778 0.2302632
## s_variance H
## 1 0.4905263 0
## 2 0.3598246 0
## 3 0.3247368 0
## 4 0.3003509 0
## 5 0.2624561 0
## 6 0.3317544 0
We don’t think that the minor allele frequency \(f\) informs about the hypothesis \(H\). We think that under the alternative \(H=1\) it is informative for our multiple testing procedure in the following way:
Higher MAF \(f =\frac{1}{2}\min \left( \bar{s},2- \bar{s}\right)\)
\(\Rightarrow\) Higher predictor variation \(\sqrt{\sum_{i=1}^n\left(s_i-\bar{s}\right)^2}\) (intuitive, both are measures of variation of \(s\))
\(\Rightarrow\) Smaller variation of \(\widehat{\beta}\) estimate \[ S E_{\widehat{\beta}}=\frac{\sqrt{\frac{1}{n-2} \sum_{i=1}^n\left(g_i-\bar{g}\right)^2}}{\sqrt{\sum_{i=1}^n\left(s_i-\bar{s}\right)^2}} \]
\(\Rightarrow\) Higher test statistic \[ t_{\text {score }}=\frac{\widehat{\beta} }{S E_{\widehat{\beta}}} \]
\(\Rightarrow\) Smaller p-value
\[P = 2(1-F_{H_0}(t_{\text {score }}))\]
Question: 1) Is above interpretation correct? 2) What happens under the null above?
ggpairs(sim_res,
mapping=ggplot2::aes(colour = H, alpha = 0.3),
columns= c(5,6,2,3,4),
title="correlogram with ggpairs()",
lower=list(combo=wrap("facethist", binwidth=1))
)
groups_by_filter <- function(covariate, nbins, ties.method="random", seed=NULL){
if (!is.null(seed) && ties.method=="random"){
#http://stackoverflow.com/questions/14324096/setting-seed-locally-not-globally-in-r?rq=1
tmp <- runif(1)
old <- .Random.seed
on.exit( { .Random.seed <<- old } )
set.seed(as.integer(seed))
}
rfs <- rank(covariate, ties.method=ties.method)/length(covariate)
as.factor(ceiling( rfs* nbins))
}
ggplot(sim_res,
aes(x = pvalue,
fill = H)) +
geom_histogram(boundary = 0)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(sim_res,
aes(x = pvalue,
fill = H)) +
geom_histogram(boundary = 0) +
facet_wrap(groups_by_filter(sim_res$minor_allele_frequency, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.